In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import seaborn as sns
from keras.layers import Dense, BatchNormalization, Dropout, LSTM
from keras.models import Sequential
from keras.utils import to_categorical
from keras.optimizers import Adam
from tensorflow.keras import regularizers
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from keras import callbacks

np.random.seed(0)
In [2]:
data=pd.read_csv("C:\\Users\\91788\\Downloads\\weatherAUS.csv")
data.head()
Out[2]:
Date Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow
0 2008-12-01 Albury 13.4 22.9 0.6 NaN NaN W 44.0 W ... 71.0 22.0 1007.7 1007.1 8.0 NaN 16.9 21.8 No No
1 2008-12-02 Albury 7.4 25.1 0.0 NaN NaN WNW 44.0 NNW ... 44.0 25.0 1010.6 1007.8 NaN NaN 17.2 24.3 No No
2 2008-12-03 Albury 12.9 25.7 0.0 NaN NaN WSW 46.0 W ... 38.0 30.0 1007.6 1008.7 NaN 2.0 21.0 23.2 No No
3 2008-12-04 Albury 9.2 28.0 0.0 NaN NaN NE 24.0 SE ... 45.0 16.0 1017.6 1012.8 NaN NaN 18.1 26.5 No No
4 2008-12-05 Albury 17.5 32.3 1.0 NaN NaN W 41.0 ENE ... 82.0 33.0 1010.8 1006.0 7.0 8.0 17.8 29.7 No No

5 rows × 23 columns

In [19]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null   float64
 18  Cloud3pm       86102 non-null   float64
 19  Temp9am        143693 non-null  float64
 20  Temp3pm        141851 non-null  float64
 21  RainToday      142199 non-null  object 
 22  RainTomorrow   142193 non-null  object 
dtypes: float64(16), object(7)
memory usage: 25.5+ MB
In [3]:
data.isnull().sum()
Out[3]:
Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64
In [4]:
data.shape
Out[4]:
(145460, 23)
In [6]:
data.Location .dtype
Out[6]:
dtype('O')
In [7]:
# Eda
In [8]:
data.shape # rows and column counts gives
Out[8]:
(145460, 23)
In [9]:
data.describe()
Out[9]:
MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustSpeed WindSpeed9am WindSpeed3pm Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm
count 143975.000000 144199.000000 142199.000000 82670.000000 75625.000000 135197.000000 143693.000000 142398.000000 142806.000000 140953.000000 130395.00000 130432.000000 89572.000000 86102.000000 143693.000000 141851.00000
mean 12.194034 23.221348 2.360918 5.468232 7.611178 40.035230 14.043426 18.662657 68.880831 51.539116 1017.64994 1015.255889 4.447461 4.509930 16.990631 21.68339
std 6.398495 7.119049 8.478060 4.193704 3.785483 13.607062 8.915375 8.809800 19.029164 20.795902 7.10653 7.037414 2.887159 2.720357 6.488753 6.93665
min -8.500000 -4.800000 0.000000 0.000000 0.000000 6.000000 0.000000 0.000000 0.000000 0.000000 980.50000 977.100000 0.000000 0.000000 -7.200000 -5.40000
25% 7.600000 17.900000 0.000000 2.600000 4.800000 31.000000 7.000000 13.000000 57.000000 37.000000 1012.90000 1010.400000 1.000000 2.000000 12.300000 16.60000
50% 12.000000 22.600000 0.000000 4.800000 8.400000 39.000000 13.000000 19.000000 70.000000 52.000000 1017.60000 1015.200000 5.000000 5.000000 16.700000 21.10000
75% 16.900000 28.200000 0.800000 7.400000 10.600000 48.000000 19.000000 24.000000 83.000000 66.000000 1022.40000 1020.000000 7.000000 7.000000 21.600000 26.40000
max 33.900000 48.100000 371.000000 145.000000 14.500000 135.000000 130.000000 87.000000 100.000000 100.000000 1041.00000 1039.600000 9.000000 9.000000 40.200000 46.70000
In [10]:
plt.figure(figsize = (20,10))
sns.heatmap(data.corr(),annot = True)
plt.show()
C:\Users\91788\AppData\Local\Temp\ipykernel_16540\256226407.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  sns.heatmap(data.corr(),annot = True)
In [12]:
sns.pairplot(data)
Out[12]:
<seaborn.axisgrid.PairGrid at 0x238856fbdc0>
In [ ]:
pca = PCA(svd_solver='randomized', random_state=42)


# fiting PCA on the dataset
pca.fit(Country_scaled)
In [ ]:
pca.components_
In [ ]:
%matplotlib inline
fig = plt.figure(figsize = (12,8))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()
In [ ]:
from sklearn.decomposition import IncrementalPCA
pca_final = IncrementalPCA(n_components=5)
In [ ]:
df_pca = pca_final.fit_transform(Country_scaled)
df_pca.shape
In [ ]:
pc = np.transpose(df_pca)
In [ ]:
pca.explained_variance_ratio_
In [ ]: